/*******************************************************************************
* Copyright 2018-2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "cpu/x64/jit_generator.hpp"

#include "cpu/x64/gemm/s8x8s32/common_u8.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

jit_avx512_core_u8_copy_sum_at_kern::jit_avx512_core_u8_copy_sum_at_kern()
    : jit_generator(nullptr, U8_COPY_KERNEL_CODE_SIZE) {}

void jit_avx512_core_u8_copy_sum_at_kern::generate() {

#ifndef _WIN32
#define M rdi
#define N rsi
#define A rdx
#define LDA rcx
#define ALPHA r8
#define B r9

#define I rax
#define A1 r10
#define A2 r8
#define LDA3 r11

#define ARG_BIAS (24 + stacksize + rsp)

#else

#define M rcx
#define N rdx
#define A r8
#define LDA r9
#define ALPHA rax
#define B rdi

#define I rax
#define A1 rsi
#define A2 r10
#define LDA3 r11

#define ARG_ALPHA 40 + stacksize + rsp
#define ARG_B 48 + stacksize + rsp
#define ARG_BIAS 72 + stacksize + rsp

#endif

    inLocalLabel();
    {

        Xbyak::Label l1750;
        Xbyak::Label l1b6c;
        Xbyak::Label l1e14;
        Xbyak::Label l20;
        Xbyak::Label l2068;
        Xbyak::Label l226c;
        Xbyak::Label l22b8;
        Xbyak::Label l22c4;
        Xbyak::Label l22f4;
        Xbyak::Label l26b4;
        Xbyak::Label l28cc;
        Xbyak::Label l2a2c;
        Xbyak::Label l2b5c;
        Xbyak::Label l2c64;
        Xbyak::Label l2c94;
        Xbyak::Label l2ca0;
        Xbyak::Label l2cc8;
        Xbyak::Label l2eac;
        Xbyak::Label l2fc0;
        Xbyak::Label l3078;
        Xbyak::Label l3118;
        Xbyak::Label l319c;
        Xbyak::Label l31c0;
        Xbyak::Label l31cc;
        Xbyak::Label l31ec;
        Xbyak::Label l32e4;
        Xbyak::Label l3378;
        Xbyak::Label l33dc;
        Xbyak::Label l3434;
        Xbyak::Label l347c;
        Xbyak::Label l349c;
        Xbyak::Label l34a8;
        Xbyak::Label l34c8;
        Xbyak::Label l3558;
        Xbyak::Label l35b0;
        Xbyak::Label l35f4;
        Xbyak::Label l3638;
        Xbyak::Label l366c;
        Xbyak::Label l368a;
        Xbyak::Label l3694;
        Xbyak::Label l36a8;
        Xbyak::Label l36ec;
        Xbyak::Label l3728;
        Xbyak::Label l3760;
        Xbyak::Label l3794;
        Xbyak::Label l37b8;
        Xbyak::Label l37d8;
        Xbyak::Label l5cc;
        Xbyak::Label l6c;
        Xbyak::Label l968;
        Xbyak::Label lc80;
        Xbyak::Label lf1c;
        Xbyak::Label lf64;
        Xbyak::Label lf70;
        Xbyak::Label lfb4;

        preamble();
        auto stacksize = get_size_of_abi_save_regs();
#ifdef _WIN32
        mov(ALPHA, ptr[ARG_ALPHA]);
        mov(B, ptr[ARG_B]);
#endif

        bool vnni_ver = mayiuse(avx512_core_vnni);

        static const unsigned one_u4 = 0x01010101u;
        static const int one_s2 = 0x00010001;
        if (vnni_ver) {
            mov(A1, (size_t)&one_u4);
            vbroadcastss(ymm15, ptr[A1]);
        } else {
            mov(A1, (size_t)&one_u4);
            vbroadcastss(zmm15, ptr[A1]);
            mov(A1, (size_t)&one_s2);
            vbroadcastss(zmm14, ptr[A1]);
        }

        mov(N, qword[N]);
        mov(M, qword[M]);
        mov(LDA, qword[LDA]);
        sub(A, -128);
        sub(B, -128);
        lea(LDA3, ptr[LDA + LDA * 2]);
        cmp(N, 0x30);
        jl(lf64, T_NEAR);
        align(4);

        L(l20);
        mov(A1, A);
        mov(I, LDA);
        shl(I, 0x5);
        lea(I, ptr[I + LDA * 8]);
        lea(I, ptr[I + LDA * 8]);
        add(A, I);
        vxorps(ymm8, ymm8, ymm8);
        vxorps(ymm9, ymm9, ymm9);
        vxorps(ymm10, ymm10, ymm10);
        vxorps(ymm11, ymm11, ymm11);
        vxorps(ymm12, ymm12, ymm12);
        vxorps(ymm13, ymm13, ymm13);
        mov(I, M);
        sar(I, 0x3);
        jle(l5cc, T_NEAR);
        align(4);

        L(l6c);
        vmovq(xmm0, qword[A1 - 0x80]);
        vmovq(xmm1, qword[A1 + LDA * 1 - 0x80]);
        vmovq(xmm2, qword[A1 + LDA * 2 - 0x80]);
        vmovq(xmm3, qword[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        vpunpckldq(xmm1, xmm0, xmm1);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm1, xmm3);
        vpunpckhqdq(xmm1, xmm1, xmm3);
        vmovq(xmm2, qword[A2 - 0x80]);
        vmovq(xmm3, qword[A2 + LDA * 1 - 0x80]);
        vmovq(xmm4, qword[A2 + LDA * 2 - 0x80]);
        vmovq(xmm5, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpckldq(xmm5, xmm4, xmm5);
        vpunpcklqdq(xmm2, xmm3, xmm5);
        vpunpckhqdq(xmm3, xmm3, xmm5);

        vinserti32x4(ymm0, ymm0, xmm2, 1);
        vinserti32x4(ymm1, ymm1, xmm3, 1);

        if (vnni_ver) {
            vpdpbusd(ymm8, ymm15, ymm0);
            vpdpbusd(ymm8, ymm15, ymm1);
        } else {
            vinsertf64x4(zmm0, zmm0, ymm1, 1);
            vpmaddubsw(zmm2, zmm15, zmm0);
            vpmaddwd(zmm2, zmm2, zmm14);
            vpaddd(ymm8, ymm8, ymm2);
            vextractf64x4(ymm3, zmm2, 1);
            vpaddd(ymm8, ymm8, ymm3);
        }
        vmovups(yword[B - 0x80], ymm0);
        vmovups(yword[B + 0x40], ymm1);

        vmovq(xmm0, qword[A2 - 0x80]);
        vmovq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        vmovq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        vmovq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm1, xmm0, xmm1);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm1, xmm3);
        vpunpckhqdq(xmm1, xmm1, xmm3);
        vmovq(xmm2, qword[A2 - 0x80]);
        vmovq(xmm3, qword[A2 + LDA * 1 - 0x80]);
        vmovq(xmm4, qword[A2 + LDA * 2 - 0x80]);
        vmovq(xmm5, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpckldq(xmm5, xmm4, xmm5);
        vpunpcklqdq(xmm2, xmm3, xmm5);
        vpunpckhqdq(xmm3, xmm3, xmm5);

        vinserti32x4(ymm0, ymm0, xmm2, 1);
        vinserti32x4(ymm1, ymm1, xmm3, 1);

        if (vnni_ver) {
            vpdpbusd(ymm9, ymm15, ymm0);
            vpdpbusd(ymm9, ymm15, ymm1);
        } else {
            vinsertf64x4(zmm0, zmm0, ymm1, 1);
            vpmaddubsw(zmm2, zmm15, zmm0);
            vpmaddwd(zmm2, zmm2, zmm14);
            vpaddd(ymm9, ymm9, ymm2);
            vextractf64x4(ymm3, zmm2, 1);
            vpaddd(ymm9, ymm9, ymm3);
        }
        vmovups(yword[B - 0x60], ymm0);
        vmovups(yword[B + 0x60], ymm1);

        vmovq(xmm0, qword[A2 - 0x80]);
        vmovq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        vmovq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        vmovq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm1, xmm0, xmm1);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm1, xmm3);
        vpunpckhqdq(xmm1, xmm1, xmm3);
        vmovq(xmm2, qword[A2 - 0x80]);
        vmovq(xmm3, qword[A2 + LDA * 1 - 0x80]);
        vmovq(xmm4, qword[A2 + LDA * 2 - 0x80]);
        vmovq(xmm5, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpckldq(xmm5, xmm4, xmm5);
        vpunpcklqdq(xmm2, xmm3, xmm5);
        vpunpckhqdq(xmm3, xmm3, xmm5);

        vinserti32x4(ymm0, ymm0, xmm2, 1);
        vinserti32x4(ymm1, ymm1, xmm3, 1);

        if (vnni_ver) {
            vpdpbusd(ymm10, ymm15, ymm0);
            vpdpbusd(ymm10, ymm15, ymm1);
        } else {
            vinsertf64x4(zmm0, zmm0, ymm1, 1);
            vpmaddubsw(zmm2, zmm15, zmm0);
            vpmaddwd(zmm2, zmm2, zmm14);
            vpaddd(ymm10, ymm10, ymm2);
            vextractf64x4(ymm3, zmm2, 1);
            vpaddd(ymm10, ymm10, ymm3);
        }
        vmovups(yword[B - 0x40], ymm0);
        vmovups(yword[B + 0x80], ymm1);

        vmovq(xmm0, qword[A2 - 0x80]);
        vmovq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        vmovq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        vmovq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm1, xmm0, xmm1);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm1, xmm3);
        vpunpckhqdq(xmm1, xmm1, xmm3);
        vmovq(xmm2, qword[A2 - 0x80]);
        vmovq(xmm3, qword[A2 + LDA * 1 - 0x80]);
        vmovq(xmm4, qword[A2 + LDA * 2 - 0x80]);
        vmovq(xmm5, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpckldq(xmm5, xmm4, xmm5);
        vpunpcklqdq(xmm2, xmm3, xmm5);
        vpunpckhqdq(xmm3, xmm3, xmm5);

        vinserti32x4(ymm0, ymm0, xmm2, 1);
        vinserti32x4(ymm1, ymm1, xmm3, 1);

        if (vnni_ver) {
            vpdpbusd(ymm11, ymm15, ymm0);
            vpdpbusd(ymm11, ymm15, ymm1);
        } else {
            vinsertf64x4(zmm0, zmm0, ymm1, 1);
            vpmaddubsw(zmm2, zmm15, zmm0);
            vpmaddwd(zmm2, zmm2, zmm14);
            vpaddd(ymm11, ymm11, ymm2);
            vextractf64x4(ymm3, zmm2, 1);
            vpaddd(ymm11, ymm11, ymm3);
        }
        vmovups(yword[B - 0x20], ymm0);
        vmovups(yword[B + 0xa0], ymm1);

        vmovq(xmm0, qword[A2 - 0x80]);
        vmovq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        vmovq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        vmovq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm1, xmm0, xmm1);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm1, xmm3);
        vpunpckhqdq(xmm1, xmm1, xmm3);
        vmovq(xmm2, qword[A2 - 0x80]);
        vmovq(xmm3, qword[A2 + LDA * 1 - 0x80]);
        vmovq(xmm4, qword[A2 + LDA * 2 - 0x80]);
        vmovq(xmm5, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpckldq(xmm5, xmm4, xmm5);
        vpunpcklqdq(xmm2, xmm3, xmm5);
        vpunpckhqdq(xmm3, xmm3, xmm5);

        vinserti32x4(ymm0, ymm0, xmm2, 1);
        vinserti32x4(ymm1, ymm1, xmm3, 1);

        if (vnni_ver) {
            vpdpbusd(ymm12, ymm15, ymm0);
            vpdpbusd(ymm12, ymm15, ymm1);
        } else {
            vinsertf64x4(zmm0, zmm0, ymm1, 1);
            vpmaddubsw(zmm2, zmm15, zmm0);
            vpmaddwd(zmm2, zmm2, zmm14);
            vpaddd(ymm12, ymm12, ymm2);
            vextractf64x4(ymm3, zmm2, 1);
            vpaddd(ymm12, ymm12, ymm3);
        }
        vmovups(yword[B], ymm0);
        vmovups(yword[B + 0xc0], ymm1);

        vmovq(xmm0, qword[A2 - 0x80]);
        vmovq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        vmovq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        vmovq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm1, xmm0, xmm1);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm1, xmm3);
        vpunpckhqdq(xmm1, xmm1, xmm3);
        vmovq(xmm2, qword[A2 - 0x80]);
        vmovq(xmm3, qword[A2 + LDA * 1 - 0x80]);
        vmovq(xmm4, qword[A2 + LDA * 2 - 0x80]);
        vmovq(xmm5, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm3, xmm2, xmm3);
        vpunpckldq(xmm5, xmm4, xmm5);
        vpunpcklqdq(xmm2, xmm3, xmm5);
        vpunpckhqdq(xmm3, xmm3, xmm5);

        vinserti32x4(ymm0, ymm0, xmm2, 1);
        vinserti32x4(ymm1, ymm1, xmm3, 1);

        if (vnni_ver) {
            vpdpbusd(ymm13, ymm15, ymm0);
            vpdpbusd(ymm13, ymm15, ymm1);
        } else {
            vinsertf64x4(zmm0, zmm0, ymm1, 1);
            vpmaddubsw(zmm2, zmm15, zmm0);
            vpmaddwd(zmm2, zmm2, zmm14);
            vpaddd(ymm13, ymm13, ymm2);
            vextractf64x4(ymm3, zmm2, 1);
            vpaddd(ymm13, ymm13, ymm3);
        }
        vmovups(yword[B + 0x20], ymm0);
        vmovups(yword[B + 0xe0], ymm1);

        sub(A1, -8);
        sub(B, -384);
        dec(I);
        jg(l6c, T_NEAR);
        align(4);

        L(l5cc);
        test(M, 0x4);
        jle(l968, T_NEAR);
        vmovd(xmm0, dword[A1 - 0x80]);
        vmovd(xmm1, dword[A1 + LDA * 1 - 0x80]);
        vmovd(xmm2, dword[A1 + LDA * 2 - 0x80]);
        vmovd(xmm3, dword[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        vpunpckldq(xmm0, xmm0, xmm1);
        vpunpckldq(xmm2, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm0, xmm2);
        vmovdqu(xword[B - 0x80], xmm0);
        vmovd(xmm1, dword[A2 - 0x80]);
        vmovd(xmm2, dword[A2 + LDA * 1 - 0x80]);
        vmovd(xmm3, dword[A2 + LDA * 2 - 0x80]);
        vmovd(xmm4, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm1, xmm1, xmm2);
        vpunpckldq(xmm3, xmm3, xmm4);
        vpunpcklqdq(xmm1, xmm1, xmm3);
        vmovdqu(xword[B - 0x70], xmm1);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxbw(ymm6, xmm1);
        vmovhlps(xmm7, xmm1, xmm1);
        vpmovsxbw(ymm7, xmm7);
        vphaddw(ymm6, ymm6, ymm7);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm8, ymm8, ymm5);
        vmovd(xmm0, dword[A2 - 0x80]);
        vmovd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        vmovd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        vmovd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm0, xmm0, xmm1);
        vpunpckldq(xmm2, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm0, xmm2);
        vmovdqu(xword[B - 0x60], xmm0);
        vmovd(xmm1, dword[A2 - 0x80]);
        vmovd(xmm2, dword[A2 + LDA * 1 - 0x80]);
        vmovd(xmm3, dword[A2 + LDA * 2 - 0x80]);
        vmovd(xmm4, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm1, xmm1, xmm2);
        vpunpckldq(xmm3, xmm3, xmm4);
        vpunpcklqdq(xmm1, xmm1, xmm3);
        vmovdqu(xword[B - 0x50], xmm1);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxbw(ymm6, xmm1);
        vmovhlps(xmm7, xmm1, xmm1);
        vpmovsxbw(ymm7, xmm7);
        vphaddw(ymm6, ymm6, ymm7);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm9, ymm9, ymm5);
        vmovd(xmm0, dword[A2 - 0x80]);
        vmovd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        vmovd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        vmovd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm0, xmm0, xmm1);
        vpunpckldq(xmm2, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm0, xmm2);
        vmovdqu(xword[B - 0x40], xmm0);
        vmovd(xmm1, dword[A2 - 0x80]);
        vmovd(xmm2, dword[A2 + LDA * 1 - 0x80]);
        vmovd(xmm3, dword[A2 + LDA * 2 - 0x80]);
        vmovd(xmm4, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm1, xmm1, xmm2);
        vpunpckldq(xmm3, xmm3, xmm4);
        vpunpcklqdq(xmm1, xmm1, xmm3);
        vmovdqu(xword[B - 0x30], xmm1);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxbw(ymm6, xmm1);
        vmovhlps(xmm7, xmm1, xmm1);
        vpmovsxbw(ymm7, xmm7);
        vphaddw(ymm6, ymm6, ymm7);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm10, ymm10, ymm5);
        vmovd(xmm0, dword[A2 - 0x80]);
        vmovd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        vmovd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        vmovd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm0, xmm0, xmm1);
        vpunpckldq(xmm2, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm0, xmm2);
        vmovdqu(xword[B - 0x20], xmm0);
        vmovd(xmm1, dword[A2 - 0x80]);
        vmovd(xmm2, dword[A2 + LDA * 1 - 0x80]);
        vmovd(xmm3, dword[A2 + LDA * 2 - 0x80]);
        vmovd(xmm4, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm1, xmm1, xmm2);
        vpunpckldq(xmm3, xmm3, xmm4);
        vpunpcklqdq(xmm1, xmm1, xmm3);
        vmovdqu(xword[B - 0x10], xmm1);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxbw(ymm6, xmm1);
        vmovhlps(xmm7, xmm1, xmm1);
        vpmovsxbw(ymm7, xmm7);
        vphaddw(ymm6, ymm6, ymm7);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm11, ymm11, ymm5);
        vmovd(xmm0, dword[A2 - 0x80]);
        vmovd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        vmovd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        vmovd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm0, xmm0, xmm1);
        vpunpckldq(xmm2, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm0, xmm2);
        vmovdqu(xword[B], xmm0);
        vmovd(xmm1, dword[A2 - 0x80]);
        vmovd(xmm2, dword[A2 + LDA * 1 - 0x80]);
        vmovd(xmm3, dword[A2 + LDA * 2 - 0x80]);
        vmovd(xmm4, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm1, xmm1, xmm2);
        vpunpckldq(xmm3, xmm3, xmm4);
        vpunpcklqdq(xmm1, xmm1, xmm3);
        vmovdqu(xword[B + 0x10], xmm1);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxbw(ymm6, xmm1);
        vmovhlps(xmm7, xmm1, xmm1);
        vpmovsxbw(ymm7, xmm7);
        vphaddw(ymm6, ymm6, ymm7);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm12, ymm12, ymm5);
        vmovd(xmm0, dword[A2 - 0x80]);
        vmovd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        vmovd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        vmovd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm0, xmm0, xmm1);
        vpunpckldq(xmm2, xmm2, xmm3);
        vpunpcklqdq(xmm0, xmm0, xmm2);
        vmovdqu(xword[B + 0x20], xmm0);
        vmovd(xmm1, dword[A2 - 0x80]);
        vmovd(xmm2, dword[A2 + LDA * 1 - 0x80]);
        vmovd(xmm3, dword[A2 + LDA * 2 - 0x80]);
        vmovd(xmm4, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpunpckldq(xmm1, xmm1, xmm2);
        vpunpckldq(xmm3, xmm3, xmm4);
        vpunpcklqdq(xmm1, xmm1, xmm3);
        vmovdqu(xword[B + 0x30], xmm1);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxbw(ymm6, xmm1);
        vmovhlps(xmm7, xmm1, xmm1);
        vpmovsxbw(ymm7, xmm7);
        vphaddw(ymm6, ymm6, ymm7);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm13, ymm13, ymm5);
        sub(A1, -4);
        sub(B, -192);
        align(4);

        L(l968);
        test(M, 0x2);
        jle(lc80, T_NEAR);
        mov(ax, word[A1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x0);
        mov(ax, word[A1 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x1);
        mov(ax, word[A1 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x2);
        mov(ax, word[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        vpinsrw(xmm0, xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrw(xmm0, xmm0, eax, 0x7);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm8, ymm8, ymm5);
        vmovdqu(xword[B - 0x80], xmm0);
        mov(ax, word[A2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x0);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x1);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x2);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrw(xmm0, xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x7);
        lea(A2, ptr[A2 + LDA * 4]);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm9, ymm9, ymm5);
        vmovdqu(xword[B - 0x70], xmm0);
        mov(ax, word[A2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x0);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x1);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x2);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrw(xmm0, xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x7);
        lea(A2, ptr[A2 + LDA * 4]);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm10, ymm10, ymm5);
        vmovdqu(xword[B - 0x60], xmm0);
        mov(ax, word[A2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x0);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x1);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x2);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrw(xmm0, xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x7);
        lea(A2, ptr[A2 + LDA * 4]);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm11, ymm11, ymm5);
        vmovdqu(xword[B - 0x50], xmm0);
        mov(ax, word[A2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x0);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x1);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x2);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrw(xmm0, xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x7);
        lea(A2, ptr[A2 + LDA * 4]);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm12, ymm12, ymm5);
        vmovdqu(xword[B - 0x40], xmm0);
        mov(ax, word[A2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x0);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x1);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x2);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrw(xmm0, xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        vpinsrw(xmm0, xmm0, eax, 0x7);
        lea(A2, ptr[A2 + LDA * 4]);
        vpmovsxbw(ymm5, xmm0);
        vmovhlps(xmm6, xmm0, xmm0);
        vpmovsxbw(ymm6, xmm6);
        vphaddw(ymm5, ymm5, ymm6);
        vpmovsxwd(ymm5, xmm5);
        vpaddd(ymm13, ymm13, ymm5);
        vmovdqu(xword[B - 0x30], xmm0);
        sub(A1, -2);
        sub(B, -96);
        align(4);

        L(lc80);
        test(M, 0x1);
        jle(lf1c, T_NEAR);
        mov(al, byte[A1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x0);
        mov(al, byte[A1 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x1);
        mov(al, byte[A1 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x2);
        mov(al, byte[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0x3);
        mov(al, byte[A2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x4);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x5);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x6);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0x7);
        mov(al, byte[A2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x8);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x9);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xa);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0xb);
        mov(al, byte[A2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xc);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xd);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xe);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0xf);
        vpmovsxbd(ymm7, xmm0);
        vpaddd(ymm8, ymm8, ymm7);
        vmovhlps(xmm7, xmm0, xmm0);
        vpmovsxbd(ymm7, xmm7);
        vpaddd(ymm9, ymm9, ymm7);
        vmovdqu(xword[B - 0x80], xmm0);
        mov(al, byte[A2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x0);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x1);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x2);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0x3);
        mov(al, byte[A2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x4);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x5);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x6);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0x7);
        mov(al, byte[A2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x8);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x9);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xa);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0xb);
        mov(al, byte[A2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xc);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xd);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xe);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0xf);
        vpmovsxbd(ymm7, xmm0);
        vpaddd(ymm10, ymm10, ymm7);
        vmovhlps(xmm7, xmm0, xmm0);
        vpmovsxbd(ymm7, xmm7);
        vpaddd(ymm11, ymm11, ymm7);
        vmovdqu(xword[B - 0x70], xmm0);
        mov(al, byte[A2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x0);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x1);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x2);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0x3);
        mov(al, byte[A2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x4);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x5);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x6);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0x7);
        mov(al, byte[A2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x8);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0x9);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xa);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0xb);
        mov(al, byte[A2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xc);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xd);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        vpinsrb(xmm0, xmm0, eax, 0xe);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        vpinsrb(xmm0, xmm0, eax, 0xf);
        vpmovsxbd(ymm7, xmm0);
        vpaddd(ymm12, ymm12, ymm7);
        vmovhlps(xmm7, xmm0, xmm0);
        vpmovsxbd(ymm7, xmm7);
        vpaddd(ymm13, ymm13, ymm7);
        vmovdqu(xword[B - 0x60], xmm0);
        sub(B, -48);
        align(4);

        L(lf1c);
        mov(A1, qword[ARG_BIAS]);
        vmovdqu(yword[A1], ymm8);
        vmovdqu(yword[A1 + 0x20], ymm9);
        vmovdqu(yword[A1 + 0x40], ymm10);
        vmovdqu(yword[A1 + 0x60], ymm11);
        vmovdqu(yword[A1 + 0x80], ymm12);
        vmovdqu(yword[A1 + 0xa0], ymm13);
        add(qword[ARG_BIAS], 0xc0);
        sub(N, 0x30);
        cmp(N, 0x30);
        jge(l20, T_NEAR);
        vzeroupper();
        align(4);

        L(lf64);
        cmp(N, 0x20);
        jl(l22b8, T_NEAR);
        align(4);

        L(lf70);
        mov(A1, A);
        mov(I, LDA);
        shl(I, 0x5);
        add(A, I);
        pxor(xmm8, xmm8);
        pxor(xmm9, xmm9);
        pxor(xmm10, xmm10);
        pxor(xmm11, xmm11);
        pxor(xmm12, xmm12);
        pxor(xmm13, xmm13);
        pxor(xmm14, xmm14);
        pxor(xmm15, xmm15);
        mov(I, M);
        sar(I, 0x4);
        jle(l1750, T_NEAR);
        align(4);

        L(lfb4);
        movdqu(xmm0, xword[A1 - 0x80]);
        movdqu(xmm1, xword[A1 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A1 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B + 0x80], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B + 0x100], xmm3);
        movdqu(xmm0, xword[A2 - 0x80]);
        movdqu(xmm1, xword[A2 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A2 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x70], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B + 0x10], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B + 0x90], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B + 0x110], xmm3);
        movdqu(xmm0, xword[A2 - 0x80]);
        movdqu(xmm1, xword[A2 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A2 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B - 0x60], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B + 0x20], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B + 0xa0], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B + 0x120], xmm3);
        movdqu(xmm0, xword[A2 - 0x80]);
        movdqu(xmm1, xword[A2 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A2 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B - 0x50], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B + 0x30], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B + 0xb0], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B + 0x130], xmm3);
        movdqu(xmm0, xword[A2 - 0x80]);
        movdqu(xmm1, xword[A2 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A2 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm12, xmm5);
        movdqu(xword[B - 0x40], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm12, xmm5);
        movdqu(xword[B + 0x40], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm12, xmm5);
        movdqu(xword[B + 0xc0], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm12, xmm5);
        movdqu(xword[B + 0x140], xmm3);
        movdqu(xmm0, xword[A2 - 0x80]);
        movdqu(xmm1, xword[A2 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A2 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm13, xmm5);
        movdqu(xword[B - 0x30], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm13, xmm5);
        movdqu(xword[B + 0x50], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm13, xmm5);
        movdqu(xword[B + 0xd0], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm13, xmm5);
        movdqu(xword[B + 0x150], xmm3);
        movdqu(xmm0, xword[A2 - 0x80]);
        movdqu(xmm1, xword[A2 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A2 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm14, xmm5);
        movdqu(xword[B - 0x20], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm14, xmm5);
        movdqu(xword[B + 0x60], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm14, xmm5);
        movdqu(xword[B + 0xe0], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm14, xmm5);
        movdqu(xword[B + 0x160], xmm3);
        movdqu(xmm0, xword[A2 - 0x80]);
        movdqu(xmm1, xword[A2 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A2 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm15, xmm5);
        movdqu(xword[B - 0x10], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm15, xmm5);
        movdqu(xword[B + 0x70], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm15, xmm5);
        movdqu(xword[B + 0xf0], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm15, xmm5);
        movdqu(xword[B + 0x170], xmm3);
        sub(A1, -16);
        sub(B, -512);
        dec(I);
        jg(lfb4, T_NEAR);
        align(4);

        L(l1750);
        test(M, 0x8);
        jle(l1b6c, T_NEAR);
        movq(xmm0, qword[A1 - 0x80]);
        movq(xmm1, qword[A1 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A1 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B], xmm1);
        movq(xmm0, qword[A2 - 0x80]);
        movq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x70], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B + 0x10], xmm1);
        movq(xmm0, qword[A2 - 0x80]);
        movq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B - 0x60], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B + 0x20], xmm1);
        movq(xmm0, qword[A2 - 0x80]);
        movq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B - 0x50], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B + 0x30], xmm1);
        movq(xmm0, qword[A2 - 0x80]);
        movq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm12, xmm5);
        movdqu(xword[B - 0x40], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm12, xmm5);
        movdqu(xword[B + 0x40], xmm1);
        movq(xmm0, qword[A2 - 0x80]);
        movq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm13, xmm5);
        movdqu(xword[B - 0x30], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm13, xmm5);
        movdqu(xword[B + 0x50], xmm1);
        movq(xmm0, qword[A2 - 0x80]);
        movq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm14, xmm5);
        movdqu(xword[B - 0x20], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm14, xmm5);
        movdqu(xword[B + 0x60], xmm1);
        movq(xmm0, qword[A2 - 0x80]);
        movq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm15, xmm5);
        movdqu(xword[B - 0x10], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm15, xmm5);
        movdqu(xword[B + 0x70], xmm1);
        sub(A1, -8);
        sub(B, -256);
        align(4);

        L(l1b6c);
        test(M, 0x4);
        jle(l1e14, T_NEAR);
        movd(xmm0, dword[A1 - 0x80]);
        movd(xmm1, dword[A1 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A1 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        movd(xmm0, dword[A2 - 0x80]);
        movd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x70], xmm0);
        movd(xmm0, dword[A2 - 0x80]);
        movd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B - 0x60], xmm0);
        movd(xmm0, dword[A2 - 0x80]);
        movd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B - 0x50], xmm0);
        movd(xmm0, dword[A2 - 0x80]);
        movd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm12, xmm5);
        movdqu(xword[B - 0x40], xmm0);
        movd(xmm0, dword[A2 - 0x80]);
        movd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm13, xmm5);
        movdqu(xword[B - 0x30], xmm0);
        movd(xmm0, dword[A2 - 0x80]);
        movd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm14, xmm5);
        movdqu(xword[B - 0x20], xmm0);
        movd(xmm0, dword[A2 - 0x80]);
        movd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm15, xmm5);
        movdqu(xword[B - 0x10], xmm0);
        sub(A1, -4);
        sub(B, -128);
        align(4);

        L(l1e14);
        test(M, 0x2);
        jle(l2068, T_NEAR);
        mov(ax, word[A1 - 0x80]);
        pinsrw(xmm0, eax, 0x0);
        mov(ax, word[A1 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x1);
        mov(ax, word[A1 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x2);
        mov(ax, word[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        pinsrw(xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrw(xmm0, eax, 0x7);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm6, xmm6);
        pmovsxwd(xmm6, xmm6);
        paddd(xmm9, xmm6);
        movdqu(xword[B - 0x80], xmm0);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x0);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x1);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x2);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrw(xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x7);
        lea(A2, ptr[A2 + LDA * 4]);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm6, xmm6);
        pmovsxwd(xmm6, xmm6);
        paddd(xmm11, xmm6);
        movdqu(xword[B - 0x70], xmm0);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x0);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x1);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x2);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrw(xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x7);
        lea(A2, ptr[A2 + LDA * 4]);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm12, xmm5);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm6, xmm6);
        pmovsxwd(xmm6, xmm6);
        paddd(xmm13, xmm6);
        movdqu(xword[B - 0x60], xmm0);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x0);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x1);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x2);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrw(xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x7);
        lea(A2, ptr[A2 + LDA * 4]);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm14, xmm5);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm6, xmm6);
        pmovsxwd(xmm6, xmm6);
        paddd(xmm15, xmm6);
        movdqu(xword[B - 0x50], xmm0);
        sub(A1, -2);
        sub(B, -64);
        align(4);

        L(l2068);
        test(M, 0x1);
        jle(l226c, T_NEAR);
        mov(al, byte[A1 - 0x80]);
        pinsrb(xmm0, eax, 0x0);
        mov(al, byte[A1 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x1);
        mov(al, byte[A1 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0x2);
        mov(al, byte[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        pinsrb(xmm0, eax, 0x3);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0x4);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x5);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0x6);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrb(xmm0, eax, 0x7);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0x8);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x9);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0xa);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrb(xmm0, eax, 0xb);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0xc);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0xd);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0xe);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrb(xmm0, eax, 0xf);
        pmovsxbd(xmm5, xmm0);
        paddd(xmm8, xmm5);
        pshufd(xmm6, xmm0, 0x55);
        pmovsxbd(xmm6, xmm6);
        paddd(xmm9, xmm6);
        pshufd(xmm5, xmm0, 0xaa);
        pmovsxbd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        pshufd(xmm6, xmm0, 0xff);
        pmovsxbd(xmm6, xmm6);
        paddd(xmm11, xmm6);
        movdqu(xword[B - 0x80], xmm0);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0x0);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x1);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0x2);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrb(xmm0, eax, 0x3);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0x4);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x5);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0x6);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrb(xmm0, eax, 0x7);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0x8);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x9);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0xa);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrb(xmm0, eax, 0xb);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0xc);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0xd);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0xe);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrb(xmm0, eax, 0xf);
        pmovsxbd(xmm5, xmm0);
        paddd(xmm12, xmm5);
        pshufd(xmm6, xmm0, 0x55);
        pmovsxbd(xmm6, xmm6);
        paddd(xmm13, xmm6);
        pshufd(xmm5, xmm0, 0xaa);
        pmovsxbd(xmm5, xmm5);
        paddd(xmm14, xmm5);
        pshufd(xmm6, xmm0, 0xff);
        pmovsxbd(xmm6, xmm6);
        paddd(xmm15, xmm6);
        movdqu(xword[B - 0x70], xmm0);
        sub(B, -32);
        align(4);

        L(l226c);
        mov(A1, qword[ARG_BIAS]);
        movdqu(xword[A1], xmm8);
        movdqu(xword[A1 + 0x10], xmm9);
        movdqu(xword[A1 + 0x20], xmm10);
        movdqu(xword[A1 + 0x30], xmm11);
        movdqu(xword[A1 + 0x40], xmm12);
        movdqu(xword[A1 + 0x50], xmm13);
        movdqu(xword[A1 + 0x60], xmm14);
        movdqu(xword[A1 + 0x70], xmm15);
        add(qword[ARG_BIAS], 0x80);
        sub(N, 0x20);
        cmp(N, 0x20);
        jge(lf70, T_NEAR);
        align(4);

        L(l22b8);
        cmp(N, 0x10);
        jl(l2c94, T_NEAR);
        align(4);

        L(l22c4);
        mov(A1, A);
        mov(I, LDA);
        shl(I, 0x4);
        add(A, I);
        pxor(xmm8, xmm8);
        pxor(xmm9, xmm9);
        pxor(xmm10, xmm10);
        pxor(xmm11, xmm11);
        mov(I, M);
        sar(I, 0x4);
        jle(l26b4, T_NEAR);
        align(4);

        L(l22f4);
        movdqu(xmm0, xword[A1 - 0x80]);
        movdqu(xmm1, xword[A1 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A1 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x40], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B + 0x40], xmm3);
        movdqu(xmm0, xword[A2 - 0x80]);
        movdqu(xmm1, xword[A2 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A2 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x70], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x30], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B + 0x10], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B + 0x50], xmm3);
        movdqu(xmm0, xword[A2 - 0x80]);
        movdqu(xmm1, xword[A2 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A2 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B - 0x60], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B - 0x20], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B + 0x20], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B + 0x60], xmm3);
        movdqu(xmm0, xword[A2 - 0x80]);
        movdqu(xmm1, xword[A2 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A2 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B - 0x50], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B - 0x10], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B + 0x30], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B + 0x70], xmm3);
        sub(A1, -16);
        sub(B, -256);
        dec(I);
        jg(l22f4, T_NEAR);
        align(4);

        L(l26b4);
        test(M, 0x8);
        jle(l28cc, T_NEAR);
        movq(xmm0, qword[A1 - 0x80]);
        movq(xmm1, qword[A1 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A1 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x40], xmm1);
        movq(xmm0, qword[A2 - 0x80]);
        movq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x70], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x30], xmm1);
        movq(xmm0, qword[A2 - 0x80]);
        movq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B - 0x60], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B - 0x20], xmm1);
        movq(xmm0, qword[A2 - 0x80]);
        movq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B - 0x50], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B - 0x10], xmm1);
        sub(A1, -8);
        sub(B, -128);
        align(4);

        L(l28cc);
        test(M, 0x4);
        jle(l2a2c, T_NEAR);
        movd(xmm0, dword[A1 - 0x80]);
        movd(xmm1, dword[A1 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A1 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        movd(xmm0, dword[A2 - 0x80]);
        movd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x70], xmm0);
        movd(xmm0, dword[A2 - 0x80]);
        movd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movdqu(xword[B - 0x60], xmm0);
        movd(xmm0, dword[A2 - 0x80]);
        movd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm11, xmm5);
        movdqu(xword[B - 0x50], xmm0);
        sub(A1, -4);
        sub(B, -64);
        align(4);

        L(l2a2c);
        test(M, 0x2);
        jle(l2b5c, T_NEAR);
        mov(ax, word[A1 - 0x80]);
        pinsrw(xmm0, eax, 0x0);
        mov(ax, word[A1 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x1);
        mov(ax, word[A1 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x2);
        mov(ax, word[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        pinsrw(xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrw(xmm0, eax, 0x7);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm6, xmm6);
        pmovsxwd(xmm6, xmm6);
        paddd(xmm9, xmm6);
        movdqu(xword[B - 0x80], xmm0);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x0);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x1);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x2);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrw(xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x7);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm6, xmm6);
        pmovsxwd(xmm6, xmm6);
        paddd(xmm11, xmm6);
        movdqu(xword[B - 0x70], xmm0);
        sub(A1, -2);
        sub(B, -32);
        align(4);

        L(l2b5c);
        test(M, 0x1);
        jle(l2c64, T_NEAR);
        mov(al, byte[A1 - 0x80]);
        pinsrb(xmm0, eax, 0x0);
        mov(al, byte[A1 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x1);
        mov(al, byte[A1 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0x2);
        mov(al, byte[A1 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A1 + LDA * 4]);
        pinsrb(xmm0, eax, 0x3);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0x4);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x5);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0x6);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrb(xmm0, eax, 0x7);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0x8);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x9);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0xa);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        lea(A2, ptr[A2 + LDA * 4]);
        pinsrb(xmm0, eax, 0xb);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0xc);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0xd);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0xe);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        pinsrb(xmm0, eax, 0xf);
        pmovsxbd(xmm5, xmm0);
        paddd(xmm8, xmm5);
        pshufd(xmm6, xmm0, 0x55);
        pmovsxbd(xmm6, xmm6);
        paddd(xmm9, xmm6);
        pshufd(xmm5, xmm0, 0xaa);
        pmovsxbd(xmm5, xmm5);
        paddd(xmm10, xmm5);
        pshufd(xmm6, xmm0, 0xff);
        pmovsxbd(xmm6, xmm6);
        paddd(xmm11, xmm6);
        movdqu(xword[B - 0x80], xmm0);
        sub(B, -16);
        align(4);

        L(l2c64);
        mov(A1, qword[ARG_BIAS]);
        movdqu(xword[A1], xmm8);
        movdqu(xword[A1 + 0x10], xmm9);
        movdqu(xword[A1 + 0x20], xmm10);
        movdqu(xword[A1 + 0x30], xmm11);
        add(qword[ARG_BIAS], 0x40);
        sub(N, 0x10);
        cmp(N, 0x10);
        jge(l22c4, T_NEAR);
        align(4);

        L(l2c94);
        cmp(N, 0x8);
        jl(l31c0, T_NEAR);
        align(4);

        L(l2ca0);
        mov(A1, A);
        lea(A2, ptr[A1 + LDA * 4]);
        lea(I, ptr[A1 + LDA * 8]);
        mov(A, I);
        pxor(xmm8, xmm8);
        pxor(xmm9, xmm9);
        mov(I, M);
        sar(I, 0x4);
        jle(l2eac, T_NEAR);
        align(4);

        L(l2cc8);
        movdqu(xmm0, xword[A1 - 0x80]);
        movdqu(xmm1, xword[A1 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A1 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A1 + LDA3 * 1 - 0x80]);
        sub(A1, -16);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x60], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x40], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x20], xmm3);
        movdqu(xmm0, xword[A2 - 0x80]);
        movdqu(xmm1, xword[A2 + LDA * 1 - 0x80]);
        movdqu(xmm2, xword[A2 + LDA * 2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA3 * 1 - 0x80]);
        sub(A2, -16);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x70], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x50], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x30], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x10], xmm3);
        sub(B, -128);
        dec(I);
        jg(l2cc8, T_NEAR);
        align(4);

        L(l2eac);
        test(M, 0x8);
        jle(l2fc0, T_NEAR);
        movq(xmm0, qword[A1 - 0x80]);
        movq(xmm1, qword[A1 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A1 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A1 + LDA3 * 1 - 0x80]);
        sub(A1, -8);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x60], xmm1);
        movq(xmm0, qword[A2 - 0x80]);
        movq(xmm1, qword[A2 + LDA * 1 - 0x80]);
        movq(xmm2, qword[A2 + LDA * 2 - 0x80]);
        movq(xmm3, qword[A2 + LDA3 * 1 - 0x80]);
        sub(A2, -8);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x70], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x50], xmm1);
        sub(B, -64);
        align(4);

        L(l2fc0);
        test(M, 0x4);
        jle(l3078, T_NEAR);
        movd(xmm0, dword[A1 - 0x80]);
        movd(xmm1, dword[A1 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A1 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A1 + LDA3 * 1 - 0x80]);
        sub(A1, -4);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        movd(xmm0, dword[A2 - 0x80]);
        movd(xmm1, dword[A2 + LDA * 1 - 0x80]);
        movd(xmm2, dword[A2 + LDA * 2 - 0x80]);
        movd(xmm3, dword[A2 + LDA3 * 1 - 0x80]);
        sub(A2, -4);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm9, xmm5);
        movdqu(xword[B - 0x70], xmm0);
        sub(B, -32);
        align(4);

        L(l3078);
        test(M, 0x2);
        jle(l3118, T_NEAR);
        mov(ax, word[A1 - 0x80]);
        pinsrw(xmm0, eax, 0x0);
        mov(ax, word[A1 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x1);
        mov(ax, word[A1 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x2);
        mov(ax, word[A1 + LDA3 * 1 - 0x80]);
        sub(A1, -2);
        pinsrw(xmm0, eax, 0x3);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x4);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        pinsrw(xmm0, eax, 0x5);
        mov(ax, word[A2 + LDA * 2 - 0x80]);
        pinsrw(xmm0, eax, 0x6);
        mov(ax, word[A2 + LDA3 * 1 - 0x80]);
        sub(A2, -2);
        pinsrw(xmm0, eax, 0x7);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm8, xmm5);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm6, xmm6);
        pmovsxwd(xmm6, xmm6);
        paddd(xmm9, xmm6);
        movdqu(xword[B - 0x80], xmm0);
        sub(B, -16);
        align(4);

        L(l3118);
        test(M, 0x1);
        jle(l319c, T_NEAR);
        mov(al, byte[A1 - 0x80]);
        pinsrb(xmm0, eax, 0x0);
        mov(al, byte[A1 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x1);
        mov(al, byte[A1 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0x2);
        mov(al, byte[A1 + LDA3 * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x3);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0x4);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x5);
        mov(al, byte[A2 + LDA * 2 - 0x80]);
        pinsrb(xmm0, eax, 0x6);
        mov(al, byte[A2 + LDA3 * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x7);
        pmovsxbd(xmm5, xmm0);
        pshufd(xmm6, xmm0, 0x55);
        pmovsxbd(xmm6, xmm6);
        paddd(xmm8, xmm5);
        paddd(xmm9, xmm6);
        movq(qword[B - 0x80], xmm0);
        sub(B, -8);
        align(4);

        L(l319c);
        mov(A1, qword[ARG_BIAS]);
        movdqu(xword[A1], xmm8);
        movdqu(xword[A1 + 0x10], xmm9);
        add(qword[ARG_BIAS], 0x20);
        sub(N, 0x8);
        cmp(N, 0x8);
        jge(l2ca0, T_NEAR);
        align(4);

        L(l31c0);
        cmp(N, 0x4);
        jl(l349c, T_NEAR);
        align(4);

        L(l31cc);
        mov(A1, A);
        lea(A2, ptr[A1 + LDA * 2]);
        lea(I, ptr[A1 + LDA * 4]);
        mov(A, I);
        pxor(xmm7, xmm7);
        mov(I, M);
        sar(I, 0x4);
        jle(l32e4, T_NEAR);
        align(4);

        L(l31ec);
        movdqu(xmm0, xword[A1 - 0x80]);
        movdqu(xmm1, xword[A1 + LDA * 1 - 0x80]);
        sub(A1, -16);
        movdqu(xmm2, xword[A2 - 0x80]);
        movdqu(xmm3, xword[A2 + LDA * 1 - 0x80]);
        sub(A2, -16);
        movdqa(xmm4, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm4, xmm1);
        movdqa(xmm5, xmm2);
        punpckldq(xmm2, xmm3);
        punpckhdq(xmm5, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        movdqa(xmm3, xmm4);
        punpcklqdq(xmm4, xmm5);
        punpckhqdq(xmm3, xmm5);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movdqu(xword[B - 0x70], xmm1);
        pmovsxbw(xmm5, xmm4);
        movhlps(xmm6, xmm4);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movdqu(xword[B - 0x60], xmm4);
        pmovsxbw(xmm5, xmm3);
        movhlps(xmm6, xmm3);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movdqu(xword[B - 0x50], xmm3);
        sub(B, -64);
        dec(I);
        jg(l31ec, T_NEAR);
        align(4);

        L(l32e4);
        test(M, 0x8);
        jle(l3378, T_NEAR);
        movq(xmm0, qword[A1 - 0x80]);
        movq(xmm1, qword[A1 + LDA * 1 - 0x80]);
        sub(A1, -8);
        movq(xmm2, qword[A2 - 0x80]);
        movq(xmm3, qword[A2 + LDA * 1 - 0x80]);
        sub(A2, -8);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        movdqa(xmm1, xmm0);
        punpcklqdq(xmm0, xmm2);
        punpckhqdq(xmm1, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        pmovsxbw(xmm5, xmm1);
        movhlps(xmm6, xmm1);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movdqu(xword[B - 0x70], xmm1);
        sub(B, -32);
        align(4);

        L(l3378);
        test(M, 0x4);
        jle(l33dc, T_NEAR);
        movd(xmm0, dword[A1 - 0x80]);
        movd(xmm1, dword[A1 + LDA * 1 - 0x80]);
        sub(A1, -4);
        movd(xmm2, dword[A2 - 0x80]);
        movd(xmm3, dword[A2 + LDA * 1 - 0x80]);
        sub(A2, -4);
        punpckldq(xmm0, xmm1);
        punpckldq(xmm2, xmm3);
        punpcklqdq(xmm0, xmm2);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        sub(B, -16);
        align(4);

        L(l33dc);
        test(M, 0x2);
        jle(l3434, T_NEAR);
        mov(ax, word[A1 - 0x80]);
        pinsrw(xmm0, eax, 0x0);
        mov(ax, word[A1 + LDA * 1 - 0x80]);
        sub(A1, -2);
        pinsrw(xmm0, eax, 0x1);
        mov(ax, word[A2 - 0x80]);
        pinsrw(xmm0, eax, 0x2);
        mov(ax, word[A2 + LDA * 1 - 0x80]);
        sub(A2, -2);
        pinsrw(xmm0, eax, 0x3);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movq(qword[B - 0x80], xmm0);
        sub(B, -8);
        align(4);

        L(l3434);
        test(M, 0x1);
        jle(l347c, T_NEAR);
        mov(al, byte[A1 - 0x80]);
        pinsrb(xmm0, eax, 0x0);
        mov(al, byte[A1 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x1);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0x2);
        mov(al, byte[A2 + LDA * 1 - 0x80]);
        pinsrb(xmm0, eax, 0x3);
        pmovsxbd(xmm5, xmm0);
        paddd(xmm7, xmm5);
        movd(dword[B - 0x80], xmm0);
        sub(B, -4);
        align(4);

        L(l347c);
        mov(A1, qword[ARG_BIAS]);
        movdqu(xword[A1], xmm7);
        add(qword[ARG_BIAS], 0x10);
        sub(N, 0x4);
        cmp(N, 0x4);
        jge(l31cc, T_NEAR);
        align(4);

        L(l349c);
        cmp(N, 0x2);
        jl(l368a, T_NEAR);
        align(4);

        L(l34a8);
        mov(A1, A);
        lea(A2, ptr[A1 + LDA * 1]);
        lea(I, ptr[A1 + LDA * 2]);
        mov(A, I);
        pxor(xmm7, xmm7);
        mov(I, M);
        sar(I, 0x4);
        jle(l3558, T_NEAR);
        align(4);

        L(l34c8);
        movdqu(xmm0, xword[A1 - 0x80]);
        sub(A1, -16);
        movdqu(xmm1, xword[A2 - 0x80]);
        sub(A2, -16);
        movdqa(xmm2, xmm0);
        punpckldq(xmm0, xmm1);
        punpckhdq(xmm2, xmm1);
        pshufd(xmm6, xmm0, 0xd8);
        pmovsxbw(xmm5, xmm6);
        movhlps(xmm6, xmm6);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        pshufd(xmm6, xmm2, 0xd8);
        pmovsxbw(xmm5, xmm6);
        movhlps(xmm6, xmm6);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movdqu(xword[B - 0x70], xmm2);
        sub(B, -32);
        dec(I);
        jg(l34c8, T_NEAR);
        align(4);

        L(l3558);
        test(M, 0x8);
        jle(l35b0, T_NEAR);
        movq(xmm0, qword[A1 - 0x80]);
        sub(A1, -8);
        movq(xmm1, qword[A2 - 0x80]);
        sub(A2, -8);
        punpckldq(xmm0, xmm1);
        pshufd(xmm6, xmm0, 0xd8);
        pmovsxbw(xmm5, xmm6);
        movhlps(xmm6, xmm6);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        sub(B, -16);
        align(4);

        L(l35b0);
        test(M, 0x4);
        jle(l35f4, T_NEAR);
        movd(xmm0, dword[A1 - 0x80]);
        sub(A1, -4);
        movd(xmm1, dword[A2 - 0x80]);
        sub(A2, -4);
        punpckldq(xmm0, xmm1);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movq(qword[B - 0x80], xmm0);
        sub(B, -8);
        align(4);

        L(l35f4);
        test(M, 0x2);
        jle(l3638, T_NEAR);
        mov(ax, word[A1 - 0x80]);
        sub(A1, -2);
        pinsrw(xmm0, eax, 0x0);
        mov(ax, word[A2 - 0x80]);
        sub(A2, -2);
        pinsrw(xmm0, eax, 0x1);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movd(dword[B - 0x80], xmm0);
        sub(B, -4);
        align(4);

        L(l3638);
        test(M, 0x1);
        jle(l366c, T_NEAR);
        mov(al, byte[A1 - 0x80]);
        pinsrb(xmm0, eax, 0x0);
        mov(byte[B - 0x80], al);
        mov(al, byte[A2 - 0x80]);
        pinsrb(xmm0, eax, 0x1);
        mov(byte[B - 0x7f], al);
        sub(B, -2);
        pmovsxbd(xmm5, xmm0);
        paddd(xmm7, xmm5);
        align(4);

        L(l366c);
        mov(A1, qword[ARG_BIAS]);
        movq(qword[A1], xmm7);
        add(qword[ARG_BIAS], 0x8);
        sub(N, 0x2);
        cmp(N, 0x2);
        jge(l34a8, T_NEAR);
        align(4);

        L(l368a);
        cmp(N, 0x1);
        jl(l37d8, T_NEAR);
        align(4);

        L(l3694);
        mov(A1, A);
        add(A, LDA);
        pxor(xmm7, xmm7);
        mov(I, M);
        sar(I, 0x4);
        jle(l36ec, T_NEAR);
        align(4);

        L(l36a8);
        movdqu(xmm0, xword[A1 - 0x80]);
        sub(A1, -16);
        pmovsxbw(xmm5, xmm0);
        movhlps(xmm6, xmm0);
        pmovsxbw(xmm6, xmm6);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        phaddw(xmm5, xmm5);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movdqu(xword[B - 0x80], xmm0);
        sub(B, -16);
        dec(I);
        jg(l36a8, T_NEAR);
        align(4);

        L(l36ec);
        test(M, 0x8);
        jle(l3728, T_NEAR);
        movq(xmm0, qword[A1 - 0x80]);
        sub(A1, -8);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm6);
        phaddw(xmm5, xmm5);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movq(qword[B - 0x80], xmm0);
        sub(B, -8);
        align(4);

        L(l3728);
        test(M, 0x4);
        jle(l3760, T_NEAR);
        movd(xmm0, dword[A1 - 0x80]);
        sub(A1, -4);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        movd(dword[B - 0x80], xmm0);
        sub(B, -4);
        align(4);

        L(l3760);
        test(M, 0x2);
        jle(l3794, T_NEAR);
        mov(ax, word[A1 - 0x80]);
        pinsrw(xmm0, eax, 0x0);
        pmovsxbw(xmm5, xmm0);
        phaddw(xmm5, xmm5);
        pmovsxwd(xmm5, xmm5);
        paddd(xmm7, xmm5);
        mov(word[B - 0x80], ax);
        sub(A1, -2);
        sub(B, -2);
        align(4);

        L(l3794);
        test(M, 0x1);
        jle(l37b8, T_NEAR);
        mov(al, byte[A1 - 0x80]);
        pinsrb(xmm0, eax, 0x0);
        pmovsxbd(xmm5, xmm0);
        paddd(xmm7, xmm5);
        mov(byte[B - 0x80], al);
        sub(B, -1);
        align(4);

        L(l37b8);
        mov(A1, qword[ARG_BIAS]);
        movd(dword[A1], xmm7);
        add(qword[ARG_BIAS], 0x4);
        sub(N, 0x1);
        cmp(N, 0x1);
        jge(l3694, T_NEAR);
        align(4);

        L(l37d8);

        postamble();
    }
    outLocalLabel();

#undef M
#undef N
#undef A
#undef LDA
#undef ALPHA
#undef B
#undef I
#undef A1
#undef A2
#undef LDA3
#ifdef _WIN32
#undef ARG_ALPHA
#undef ARG_B
#endif
#undef ARG_BIAS
}

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl
